# Notebook-export cosmetics: hide input cells in the exported HTML and add a
# button to show/hide them. Has no effect when running inside the notebook app.
import IPython.core.display as di
# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">TOGGLE TO SEE CODES</button>''', raw=True)
#Imports
import numpy as np
import pandas as pd
#For linear_kernel
from sklearn.metrics.pairwise import linear_kernel
# Import CountVectorizer for count matrix
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore")
#Read data from CSV file (Netflix catalogue; expected columns include
# 'type', 'title', 'director', 'cast', 'listed_in', 'description')
nf = pd.read_csv('netflix_titles.csv')
# nf
# Replace NaNs with empty strings so the text columns can be concatenated later.
nf = nf.fillna('')
# Split the catalogue by content type; note both keep the ORIGINAL nf row labels.
nf_tv = nf.loc[nf['type']=='TV Show', :]
nf_movie = nf.loc[nf['type']=='Movie', :]
#pd.options.mode.chained_assignment = None # default='warn'
##### `Key terms:`
##### `- Term Frequency (TF): `
##### + The number of times a word appears in a document divided by the total number of words in the document.
##### + Every document has its own term frequency.
##### `- Inverse Data Frequency (IDF): `
##### + The log of the number of documents divided by the number of documents that contain the word w.
##### + Inverse data frequency determines the weight of rare words across all documents in the corpus.
##### `Steps:`
##### 1. Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
##### 2. Construct the required TF-IDF matrix by fitting and transforming the data
##### 3. Compute the cosine similarity matrix
##### 4. Construct a reverse map of indices and movie titles
from sklearn.feature_extraction.text import TfidfVectorizer
# 1. TF-IDF vectorizer; removes English stop words such as 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')
# 2. TF-IDF matrix over movie descriptions; row i of the matrix corresponds to
#    the i-th POSITIONAL row of nf_movie.
tfidf_matrix = tfidf.fit_transform(nf_movie['description'])
# tfidf_matrix.shape
# 3. Cosine similarity matrix. linear_kernel equals cosine similarity here
#    because TfidfVectorizer emits L2-normalised vectors.
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# 4. Reverse map of titles -> positional rows of cosine_sim.
#    BUG FIX: nf_movie is a filtered slice of nf, so nf_movie.index holds the
#    original nf row labels, NOT 0..n-1 positions. Indexing cosine_sim with
#    those labels selected the wrong (or an out-of-range) row. Map each title
#    to its position instead, and keep one entry per duplicate title.
indices = pd.Series(range(len(nf_movie)), index=nf_movie['title'])
indices = indices[~indices.index.duplicated(keep='first')]
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to `title`.

    The lookup uses the title-cased form of `title` against the global
    `indices` map; similarity comes from `cosine_sim` (the TF-IDF matrix
    by default, or any square similarity matrix passed by the caller).
    """
    row = indices.get(title.title())
    if row is None:
        return "Movie is not found."
    # Pair every movie with its similarity score and rank best-first.
    ranked = sorted(enumerate(cosine_sim[row]), key=lambda pair: pair[1], reverse=True)
    # Skip rank 0 (the movie itself) and keep the next ten row positions.
    top_ten = [pos for pos, _score in ranked[1:11]]
    return nf_movie['title'].iloc[top_ten]
# Smoke test: the empty string is not a known title, so this returns the
# "Movie is not found." message rather than raising.
get_recommendations('')
# 1.2. Test case-sensitive: Recommendation for movie "The ConjUring"
get_recommendations('The ConjUring')
# 1.3. Test non-letter characters: Recommendation for movie "#Alive"
get_recommendations('#Alive')
#### `Steps:`
#### 1. Feature Selection: Define the selected features: 'title', 'director', 'cast', 'listed_in', and 'description'.
#### 2. Combined Feature Creation
#### 3. Count Matrix Creation: Import CountVectorizer from scikit-learn.
#### + Instantiate CountVectorizer with stop_words set to 'english'.
#### + Create the count matrix using fit_transform on 'combined_features'.
#### 4. Cosine Similarity Calculation: determine how similar each item is to every other item
#### + Compute the cosine similarity matrix based on the count_matrix.
#### + 0 means no similarity
#### + 1 means that both the items are 100% similar.
#### 5. Reverse Index Mapping: map indices back to movie titles when recommending movies
#### + Create a Series of indices mapped to movie titles.
#### + Drop duplicate indices to establish a unique mapping.
def recommendations_movie(title, cosine_sim=cosine_sim):
    """Recommend the 10 movies most similar to `title`.

    Looks up the title-cased form of `title` in the global `indices` map
    and ranks rows of `cosine_sim` by pairwise similarity.
    """
    pos = indices.get(title.title())
    if pos is None:
        return "Movie not found."
    # Rank all movies by similarity to the requested one, best first.
    scores = sorted(enumerate(cosine_sim[pos]), key=lambda p: p[1], reverse=True)
    # Drop the movie itself (rank 0) and keep the next ten positions.
    picks = [row for row, _ in scores[1:11]]
    return nf_movie['title'].iloc[picks]
# 1. Feature selection: the text columns that describe a movie.
features=['title', 'director','cast','listed_in', 'description']
# .copy() so the column assignment below writes to an independent frame rather
# than a view of nf_movie (chained assignment was only silenced by the
# warnings filter above and is not guaranteed to stick).
df_features = nf_movie[features].copy()
# 2. Combine the selected features into one text blob per movie.
def combined_features(x):
    """Concatenate a row's feature columns into one space-separated string."""
    return x['title'] + ' ' + x['director'] + ' ' + x['cast'] + ' ' +x['listed_in']+' '+ x['description']
# Use the helper (previously defined but unused; the ' '.join lambda it
# replaced produced the identical string for these five columns).
df_features['combined_features'] = df_features.apply(combined_features, axis=1)
# 3. Bag-of-words count matrix, English stop words removed.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_features['combined_features'])
# 4. Cosine similarity between every pair of movies
#    (0 = no shared vocabulary, 1 = identical text).
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2 = cosine_similarity(count_matrix)
# 5. Reverse map of titles -> rows. reset_index() renumbers rows 0..n-1 so the
#    positions line up with count_matrix / cosine_sim2; keep one entry per
#    duplicate title so lookups return a scalar position.
df_features = df_features.reset_index()
indices = pd.Series(df_features.index, index=df_features['title'])
indices = indices[~indices.index.duplicated(keep='first')]
# indices
# 2.1. Recommendation for movie: "The Conjuring"
recommendations_movie('The Conjuring', cosine_sim2)
# 2.2. Recommendation for movie: "#Alive"
recommendations_movie('#Alive', cosine_sim2)
# 2.3. Recommendation for movie: "High & Low The Movie"
recommendations_movie('High & Low The Movie', cosine_sim2)
# nf_tv
# Same pipeline as the movie recommender, applied to TV shows.
# 1. Feature selection: the text columns that describe a show.
features=['title', 'director','cast','listed_in', 'description']
# .copy() so the column assignment below writes to an independent frame rather
# than a view of nf_tv (avoids chained-assignment lost writes).
df_features = nf_tv[features].copy()
# 2. Combine the selected features into one text blob per show.
def combined_features(x):
    """Concatenate a row's feature columns into one space-separated string."""
    return x['title'] + ' ' + x['director'] + ' ' + x['cast'] + ' ' +x['listed_in']+' '+ x['description']
df_features['combined_features'] = df_features.apply(combined_features, axis=1)
# 3. Bag-of-words count matrix, English stop words removed.
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_features['combined_features'])
# 4. Cosine similarity between every pair of shows.
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim2_tv = cosine_similarity(count_matrix)
# 5. Reverse map of titles -> rows; reset_index() makes positions line up with
#    cosine_sim2_tv. Keep one entry per duplicate title.
df_features = df_features.reset_index()
indices = pd.Series(df_features.index, index=df_features['title'])
indices = indices[~indices.index.duplicated(keep='first')]
# indices
def recommendations_tv(title, cosine_sim=cosine_sim2_tv):
    """Recommend the 10 TV shows most similar to `title` (title-cased lookup)."""
    pos = indices.get(title.title())
    if pos is None:
        return "TV Show not found."
    # Rank every show by similarity to the requested one (best first),
    # then drop rank 0 (the show itself) and keep the next ten.
    ranked = sorted(enumerate(cosine_sim[pos]), key=lambda p: p[1], reverse=True)[1:11]
    chosen = [row for row, _ in ranked]
    return nf_tv['title'].iloc[chosen]
# 1. Recommendation for show: "Kingdom"
recommendations_tv('Kingdom', cosine_sim2_tv)
# 2. Recommendation for show: "Stranger Things"
recommendations_tv('Stranger Things', cosine_sim2_tv)
# *Alternative (kept for reference, not executed): same TV recommender using TF-IDF via TfidfVectorizer*
# features=['title', 'director','cast','listed_in', 'description']
# df_features_tv = nf_tv[features]
# def combined_features(x):
# return x['title'] + ' ' + x['director'] + ' ' + x['cast'] + ' ' +x['listed_in']+' '+ x['description']
# df_features_tv['combined_features'] = df_features_tv.apply(combined_features, axis=1)
# tfidf = TfidfVectorizer(stop_words='english')
# tfidf_matrix_tv = tfidf.fit_transform(df_features_tv['combined_features'])
# cosine_sim2_tv = linear_kernel(tfidf_matrix_tv, tfidf_matrix_tv)
# df_features_tv = df_features_tv.reset_index()
# indices = pd.Series(df_features_tv.index, index=df_features_tv['title']).drop_duplicates()
# def recommendations_tv(title, cosine_sim=cosine_sim):
# idx = indices.get(title.title())
# if idx is None:
# return "TV show not found."
# # Get the pairwsie similarity scores of all movies with that movie
# sim_scores = list(enumerate(cosine_sim[idx]))
# # Sort the movies based on the similarity scores
# sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
# # Get the scores of the 10 most similar movies
# sim_scores = sim_scores[1:11]
# # Get the movie indices
# tv_indices = [i[0] for i in sim_scores]
# # Return the top 10 most similar movies
# return nf_tv['title'].iloc[tv_indices]